In [38]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
In [39]:
# Load the training split of the UCI Adult (census income) dataset.
train = pd.read_csv("./data/adult.data.csv")
In [40]:
# Load the test split of the Adult dataset (note: per the preview below,
# this file has no fnlwgt column).
test = pd.read_csv("./data/adult.test.csv")
In [41]:
# Preview the first rows of the training data to sanity-check the parse.
train.head()
Out[41]:
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
In [42]:
# Preview the first rows of the test data; missing values show up as "?".
test.head()
Out[42]:
| age | workclass | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | Private | 11th | 7 | Never-married | Machine-op-inspct | Own-child | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 1 | 38 | Private | HS-grad | 9 | Married-civ-spouse | Farming-fishing | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
| 2 | 28 | Local-gov | Assoc-acdm | 12 | Married-civ-spouse | Protective-serv | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
| 3 | 44 | Private | Some-college | 10 | Married-civ-spouse | Machine-op-inspct | Husband | Black | Male | 7688 | 0 | 40 | United-States | >50K |
| 4 | 18 | ? | Some-college | 10 | Never-married | ? | Own-child | White | Female | 0 | 0 | 30 | United-States | <=50K |
In [43]:
# Missing values in this dataset are encoded as " ?" (with a leading space);
# normalize them to NaN so dropna() can remove those rows.
# Reassignment instead of inplace=True: pandas discourages inplace, and the
# explicit assignment keeps the cell's effect visible and chainable.
train = train.replace(' ?', np.nan)
In [44]:
# Drop every row that still contains a NaN (i.e. rows that had " ?" fields).
train=train.dropna()
In [45]:
# Show the cleaned training frame; 30162 rows remain per the output below.
train
Out[45]:
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K |
| 32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
| 32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
| 32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
| 32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K |
30162 rows × 15 columns
In [46]:
# Apply the same cleaning as for the training split: normalize " ?" to NaN,
# then drop incomplete rows (15060 rows remain per the output below).
# A single chained expression avoids inplace=True and mirrors the
# train-cleaning steps above in one idempotent statement.
test = test.replace(' ?', np.nan).dropna()
test
Out[46]:
| age | workclass | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | Private | 11th | 7 | Never-married | Machine-op-inspct | Own-child | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 1 | 38 | Private | HS-grad | 9 | Married-civ-spouse | Farming-fishing | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
| 2 | 28 | Local-gov | Assoc-acdm | 12 | Married-civ-spouse | Protective-serv | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
| 3 | 44 | Private | Some-college | 10 | Married-civ-spouse | Machine-op-inspct | Husband | Black | Male | 7688 | 0 | 40 | United-States | >50K |
| 5 | 34 | Private | 10th | 6 | Never-married | Other-service | Not-in-family | White | Male | 0 | 0 | 30 | United-States | <=50K |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16275 | 33 | Private | Bachelors | 13 | Never-married | Prof-specialty | Own-child | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 16276 | 39 | Private | Bachelors | 13 | Divorced | Prof-specialty | Not-in-family | White | Female | 0 | 0 | 36 | United-States | <=50K |
| 16278 | 38 | Private | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
| 16279 | 44 | Private | Bachelors | 13 | Divorced | Adm-clerical | Own-child | Asian-Pac-Islander | Male | 5455 | 0 | 40 | United-States | <=50K |
| 16280 | 35 | Self-emp-inc | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 60 | United-States | >50K |
15060 rows × 14 columns
In [47]:
# Drop the fnlwgt column (census sampling weight) from the training split.
# The test split does not even have this column (see its preview above), so
# removing it aligns the two frames before they are concatenated.
train = train.drop(columns=["fnlwgt"])
In [48]:
# Confirm fnlwgt is gone and train now matches test's column layout.
train.head()
Out[48]:
| age | workclass | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
In [49]:
# Stack train and test into one frame so one-hot encoding later sees every
# category level present in either split. Index labels overlap between the
# two parts, but the later re-split uses positional slicing, so that's safe.
data = pd.concat([train,test])
In [50]:
# Report the size of each split.
# Bug fix: the second line previously printed "Number of training data"
# for the TEST count, making the output ambiguous.
print('Number of training data: ', len(train))
print('Number of test data: ', len(test))
Number of training data: 30162 Number of training data: 15060
In [51]:
# Verify the combined frame: 45222 rows (30162 + 15060), 14 columns, no nulls.
data.info()
<class 'pandas.core.frame.DataFrame'> Index: 45222 entries, 0 to 16280 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 45222 non-null int64 1 workclass 45222 non-null object 2 education 45222 non-null object 3 education-num 45222 non-null int64 4 marital-status 45222 non-null object 5 occupation 45222 non-null object 6 relationship 45222 non-null object 7 race 45222 non-null object 8 sex 45222 non-null object 9 capital-gain 45222 non-null int64 10 capital-loss 45222 non-null int64 11 hours-per-week 45222 non-null int64 12 native-country 45222 non-null object 13 income 45222 non-null object dtypes: int64(5), object(9) memory usage: 5.2+ MB
In [26]:
pip install seaborn --upgrade
Requirement already satisfied: seaborn in d:\anaconda3\envs\thktdl\lib\site-packages (0.12.2)
Collecting seaborn
Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in d:\anaconda3\envs\thktdl\lib\site-packages (from seaborn) (1.26.0)
Requirement already satisfied: pandas>=1.2 in d:\anaconda3\envs\thktdl\lib\site-packages (from seaborn) (2.1.1)
Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in d:\anaconda3\envs\thktdl\lib\site-packages (from seaborn) (3.8.0)
Requirement already satisfied: contourpy>=1.0.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.2.0)
Requirement already satisfied: cycler>=0.10 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.4)
Requirement already satisfied: packaging>=20.0 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (23.1)
Requirement already satisfied: pillow>=6.2.0 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in d:\anaconda3\envs\thktdl\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from pandas>=1.2->seaborn) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in d:\anaconda3\envs\thktdl\lib\site-packages (from pandas>=1.2->seaborn) (2023.3)
Requirement already satisfied: six>=1.5 in d:\anaconda3\envs\thktdl\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
---------------------------------------- 0.0/294.9 kB ? eta -:--:--
---------------------------------------- 0.0/294.9 kB ? eta -:--:--
- -------------------------------------- 10.2/294.9 kB ? eta -:--:--
- -------------------------------------- 10.2/294.9 kB ? eta -:--:--
---- ---------------------------------- 30.7/294.9 kB 217.9 kB/s eta 0:00:02
----- --------------------------------- 41.0/294.9 kB 217.9 kB/s eta 0:00:02
-------------- ----------------------- 112.6/294.9 kB 467.6 kB/s eta 0:00:01
-------------- ----------------------- 112.6/294.9 kB 467.6 kB/s eta 0:00:01
-------------- ----------------------- 112.6/294.9 kB 467.6 kB/s eta 0:00:01
-------------------------------- ----- 256.0/294.9 kB 749.3 kB/s eta 0:00:01
-------------------------------------- 294.9/294.9 kB 699.4 kB/s eta 0:00:00
Installing collected packages: seaborn
Attempting uninstall: seaborn
Found existing installation: seaborn 0.12.2
Uninstalling seaborn-0.12.2:
Successfully uninstalled seaborn-0.12.2
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.
In [16]:
# Pearson correlation heatmap over the numeric (int64) columns only —
# categorical columns cannot be correlated directly.
numeric_data = data.select_dtypes(include=['int64'])
plt.figure(figsize=(16,9))
sns.heatmap(numeric_data.corr(method='pearson'), annot=True)
Out[16]:
<Axes: >
In [17]:
# Separate the target column ('income') from the predictor columns.
label = data['income']
feature = data.drop(columns='income')
In [18]:
# List the categorical (non-int64) columns that will need one-hot encoding.
feature.select_dtypes(exclude=['int64']).columns
Out[18]:
Index(['workclass', 'education', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'native-country'],
dtype='object')
In [19]:
# One-hot encode every categorical column; the numeric columns pass through
# untouched. Result: 103 columns (see output below).
feature_onehot = pd.get_dummies(feature,columns=feature.select_dtypes(exclude=['int64']).columns)
feature_onehot
Out[19]:
| age | education-num | capital-gain | capital-loss | hours-per-week | workclass_ Federal-gov | workclass_ Local-gov | workclass_ Private | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | ... | native-country_ Portugal | native-country_ Puerto-Rico | native-country_ Scotland | native-country_ South | native-country_ Taiwan | native-country_ Thailand | native-country_ Trinadad&Tobago | native-country_ United-States | native-country_ Vietnam | native-country_ Yugoslavia | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | 13 | 2174 | 0 | 40 | False | False | False | False | False | ... | False | False | False | False | False | False | False | True | False | False |
| 1 | 50 | 13 | 0 | 0 | 13 | False | False | False | False | True | ... | False | False | False | False | False | False | False | True | False | False |
| 2 | 38 | 9 | 0 | 0 | 40 | False | False | True | False | False | ... | False | False | False | False | False | False | False | True | False | False |
| 3 | 53 | 7 | 0 | 0 | 40 | False | False | True | False | False | ... | False | False | False | False | False | False | False | True | False | False |
| 4 | 28 | 13 | 0 | 0 | 40 | False | False | True | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16275 | 33 | 13 | 0 | 0 | 40 | False | False | True | False | False | ... | False | False | False | False | False | False | False | True | False | False |
| 16276 | 39 | 13 | 0 | 0 | 36 | False | False | True | False | False | ... | False | False | False | False | False | False | False | True | False | False |
| 16278 | 38 | 13 | 0 | 0 | 50 | False | False | True | False | False | ... | False | False | False | False | False | False | False | True | False | False |
| 16279 | 44 | 13 | 5455 | 0 | 40 | False | False | True | False | False | ... | False | False | False | False | False | False | False | True | False | False |
| 16280 | 35 | 13 | 0 | 0 | 60 | False | False | False | True | False | ... | False | False | False | False | False | False | False | True | False | False |
45222 rows × 103 columns
In [20]:
# Re-split the combined, encoded frame back into the original train/test
# rows (concat preserved order: train rows first, then test rows).
# Fix: use len(train) instead of the magic number 30162 so the split stays
# correct if the cleaning steps above ever change the row count.
n_train = len(train)
x_train = feature_onehot[:n_train]
x_test = feature_onehot[n_train:]
y_train = label[:n_train]
y_test = label[n_train:]
In [21]:
# Train a decision tree using entropy (information-gain) splitting.
# random_state=0 pins the tie-breaking so results are reproducible.
clf = tree.DecisionTreeClassifier(criterion="entropy",random_state=0)
clf.fit(x_train,y_train)
Out[21]:
DecisionTreeClassifier(criterion='entropy', random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(criterion='entropy', random_state=0)
In [22]:
# Evaluate the entropy tree on the held-out test rows.
tree_pred = clf.predict(x_test)
tree_score = metrics.accuracy_score(y_test,tree_pred)
# Fix: corrected the misspelled label "Accruracy" -> "Accuracy".
print("Accuracy:",tree_score)
print("Report:",metrics.classification_report(y_test,tree_pred))
Accruracy: 0.8175298804780876
Report: precision recall f1-score support
<=50K 0.88 0.88 0.88 11360
>50K 0.63 0.62 0.62 3700
accuracy 0.82 15060
macro avg 0.75 0.75 0.75 15060
weighted avg 0.82 0.82 0.82 15060
In [23]:
# Confusion matrix for the entropy tree (rows = actual, columns = predicted).
tree_cm = metrics.confusion_matrix(y_test,tree_pred)
In [24]:
# Heatmap of the entropy tree's confusion matrix.
# Bug fix: sklearn's confusion_matrix puts TRUE labels on the rows and
# PREDICTED labels on the columns, so the y-axis must be 'Actual' and the
# x-axis 'Predicted' — the original labels were swapped.
# Also use fmt="d": the cells are integer counts, not floats.
plt.figure(figsize=(12,12))
sns.heatmap(tree_cm, annot=True, fmt="d", linewidth=.5, square=True, cmap='Blues_r');
plt.ylabel('Actual Label');
plt.xlabel('Predicted Label');
title ='Decision Tree Accuracy Score:{0}'.format(tree_score)
plt.title(title,size=15);
In [25]:
# Render the fitted entropy tree and save it to disk.
# savefig() with no extension writes 'decision_tree.png' (matplotlib's
# default format). Save must come before show(), which clears the figure.
fig, ax = plt.subplots(figsize=(50,24))
tree.plot_tree(clf,filled=True,fontsize=10)
plt.savefig('decision_tree',dpi=100)
plt.show()
In [52]:
# Question 8: retrain with criterion='gini' instead of 'entropy'.
# NOTE(review): this rebinds `clf`, discarding the entropy model — the
# earlier tree can no longer be inspected after this cell runs.
clf = tree.DecisionTreeClassifier(criterion="gini",random_state=0)
clf.fit(x_train,y_train)
Out[52]:
DecisionTreeClassifier(random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(random_state=0)
In [53]:
# Evaluate the gini tree on the held-out test rows.
tree_pred = clf.predict(x_test)
tree_score = metrics.accuracy_score(y_test,tree_pred)
# Fix: corrected the misspelled label "Accruracy" -> "Accuracy".
print("Accuracy:",tree_score)
print("Report:",metrics.classification_report(y_test,tree_pred))
Accruracy: 0.8122841965471448
Report: precision recall f1-score support
<=50K 0.87 0.88 0.88 11360
>50K 0.62 0.60 0.61 3700
accuracy 0.81 15060
macro avg 0.75 0.74 0.74 15060
weighted avg 0.81 0.81 0.81 15060
In [54]:
# Confusion matrix for the gini tree (rows = actual, columns = predicted).
tree_cm = metrics.confusion_matrix(y_test,tree_pred)
In [55]:
# Heatmap of the gini tree's confusion matrix.
# Bug fix: sklearn's confusion_matrix puts TRUE labels on the rows and
# PREDICTED labels on the columns, so the y-axis must be 'Actual' and the
# x-axis 'Predicted' — the original labels were swapped.
# Also use fmt="d": the cells are integer counts, not floats.
plt.figure(figsize=(12,12))
sns.heatmap(tree_cm, annot=True, fmt="d", linewidth=.5, square=True, cmap='Pastel1');
plt.ylabel('Actual Label');
plt.xlabel('Predicted Label');
title ='Decision Tree Accuracy Score:{0}'.format(tree_score)
plt.title(title,size=15);
In [56]:
# Render the fitted gini tree and save it to disk.
fig, ax = plt.subplots(figsize=(50,24))
tree.plot_tree(clf,filled=True,fontsize=10)
# Bug fix: the entropy-tree cell above already saved to 'decision_tree';
# reusing the same filename here silently overwrote that figure. Save the
# gini tree under its own name instead.
plt.savefig('decision_tree_gini',dpi=100)
plt.show()
In [57]:
# Gaussian naive Bayes classifier (assumes features are conditionally
# normal — a rough fit for the one-hot dummy columns, but standard practice).
gnb = GaussianNB()
In [58]:
# Fit Gaussian naive Bayes on the training rows, then score its
# predictions on the held-out test rows.
gnb.fit(x_train, y_train)
bayes_pred = gnb.predict(x_test)
bayes_score = metrics.accuracy_score(y_test, bayes_pred)
print("Accuracy: ", bayes_score)
print("Report: ", metrics.classification_report(y_test, bayes_pred))
Accuracy: 0.8029216467463479
Report: precision recall f1-score support
<=50K 0.93 0.80 0.86 11360
>50K 0.57 0.82 0.67 3700
accuracy 0.80 15060
macro avg 0.75 0.81 0.76 15060
weighted avg 0.84 0.80 0.81 15060
In [59]:
# Confusion-matrix heatmap for the naive Bayes model. The axis labels here
# were already correct (rows = actual, columns = predicted).
bayes_cm = metrics.confusion_matrix(y_test, bayes_pred)
plt.figure(figsize=(12,12))
# fmt="d": the cells are integer counts, not floats.
sns.heatmap(bayes_cm, annot=True, fmt="d", linewidth=.5, square=True, cmap='Greens');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
# Fix: corrected the typo "Native Bayes" -> "Naive Bayes" in the title.
title = 'Naive Bayes Accuracy Score: {0}'.format(bayes_score)
plt.title(title, size=15);
Độ chính xác:¶
- Criterion="gini": 0.8122841965471448
- Criterion="entropy": 0.8175298804780876
- Naive Bayes: 0.8029216467463479
Nhận xét:¶
Cả hai mô hình đều đạt độ chính xác cao trên 80%, tuy nhiên mô hình sử dụng criterion="entropy" có độ chính xác cao hơn một chút (khoảng 0.5%). Sự chênh lệch này khá nhỏ và có thể không đáng kể về mặt thực tế.
Kết luận:¶
Cả hai mô hình DecisionTreeClassifier sử dụng criterion="gini" và criterion="entropy" đều đạt hiệu quả cao trong việc phân loại dữ liệu thu nhập.
In [ ]: